import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
import time
import os
import caffeine
from tqdm import tqdm
import seaborn as sns
sns.set_theme(style="darkgrid", palette="Set1")
import klib
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.colors import n_colors
import plotly.offline as pyo
import warnings
warnings.filterwarnings('ignore')
import folium
import re
import chart_studio
import chart_studio.plotly as py
# This is to diplay nicely pandas dataframes
def table (df, row_size=40, line_break=False):
if line_break == True:
for col in df.columns:
df[col] = df[col].apply(lambda x: x.replace(" ", "<br>") if type(x) == str else x)
table_data = [df.columns] + df.values.tolist()[:10]
return ff.create_table(table_data)
# Return a capitalized version of the dataframe
def capitalize(df):
for col in df.columns:
try:df[col] = df[col].str.title()
except: ...
return df
df_games = pd.read_csv("data/clean_games.csv")
df_events = pd.read_csv("data/clean_events.csv")
df_players = pd.read_csv("data/clean_players.csv")
df_players_info = pd.read_csv("data/clean_players_info.csv")
df_lineup = pd.read_csv("data/clean_lineup.csv")
# df_stats = pd.read_csv("data/clean_stats.csv")
Calculating the time spent
There is a rule that if a player leaves the pitch (subtitution or card) he can not return so that gives us some defined senarios:

# Creating substition of df_events, we don't need all the columns
sub_events = df_events[df_events["description"]\
.isin(["sub_on", "sub_off", "red_card", "2y_red_card"])]\
[["type", "description", "player_id", "match_id", "clock_label"]]
# Creating substition of df_players, we don't need all the columns
sub_players = df_players[["player_name", "player_id", "substitute", "match_id"]]
sub_merged = sub_players.merge(sub_events, on=["player_id", "match_id"], how="left").drop_duplicates()
# If player not substitute (starting game on the pitch, not on bench)
sub_merged["time_pitch"] = sub_merged[["clock_label", "description", "substitute"]]\
.apply(lambda x: (90 if pd.isnull(x["description"]) else x["clock_label"]) if x["substitute"] == 0 else 0,
axis=1)
sub_merged["time_pitch"] = sub_merged[["clock_label", "description", "substitute", "time_pitch"]]\
.apply(lambda x: (90-x["clock_label"] if x["description"]=="sub_on" else x["clock_label"]-90)\
if pd.notnull(x["description"]) and x["substitute"] == 1 else x["time_pitch"],
axis=1)
Merging time played on df_players
# Addding time for each player/game as some players subed on and red_card or sub off
sub_merged = sub_merged.groupby(["player_name", "match_id", "player_id"])\
.sum()["time_pitch"].sort_values(ascending=False).to_frame().reset_index()
df_players = df_players.merge(sub_merged[["player_id", "match_id", "time_pitch"]],
on=["player_id", "match_id"],
how="left")
df_players.head()
| match_position | team_id | info_loan | age | substitute | substit ute | captain | player_number | full_position | match_id | player_name | player_id | nateam | birth_date | birth_country_isoCode | birth_country | normal_position | time_pitch | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | d | 28 | NaN | 29.557988 | 0.0 | NaN | False | NaN | defender | 3405 | stefan schnoor | 1545 | germany | 1971-04-18 | de | germany | d | 90.0 |
| 1 | d | 28 | NaN | 29.436949 | 0.0 | NaN | False | NaN | defender | 3348 | stefan schnoor | 1545 | germany | 1971-04-18 | de | germany | d | 90.0 |
| 2 | d | 28 | NaN | 28.861989 | NaN | 1.0 | False | NaN | defender | 3162 | stefan schnoor | 1545 | germany | 1971-04-18 | de | germany | d | 0.0 |
| 3 | d | 28 | NaN | 28.919485 | NaN | 1.0 | False | NaN | defender | 3189 | stefan schnoor | 1545 | germany | 1971-04-18 | de | germany | d | 0.0 |
| 4 | d | 28 | NaN | 28.465563 | 0.0 | NaN | False | NaN | defender | 3002 | stefan schnoor | 1545 | germany | 1971-04-18 | de | germany | d | 90.0 |
Is there a number to position pattern in football? Do players wear a certain number if they play a certain position?
# Replacing letter by real position
replacement = {"g": "goalkeeper",
"d": "defender",
"m": "midfielder",
"f": "forward"}
df_players["normal_position"].replace(replacement, inplace=True)
df_players_info["normal_position"].replace(replacement, inplace=True)
# Creating dictionary with the noral position mode fo each number
player_num_position = {int(i):df_players[df_players.player_number==i]["normal_position"].mode()[0] for i in df_players["player_number"].unique() if pd.isnull(i)==False}
# Replacing numbers 99 as missing values
df_players["player_number"] = df_players["player_number"]\
.apply(lambda x: np.nan if x == 99 or x == -1 or pd.isnull(x) else int(x))
# # Number who used in the games more than 1,000 time.
sub_df = df_players["player_number"].value_counts().sort_values(ascending=False)
sub_df = sub_df[sub_df > 1000]
sub_df.index = list(map(lambda x: int(x), sub_df.index))
sub_df = sub_df.to_frame().reset_index()
sub_df["position"] = sub_df["index"].apply(lambda x: player_num_position[x])
players_number = px.bar(sub_df, x="index", y="player_number", color="position",
title="Number of time the most used numbers have been used (>1,000)")
players_number.update_xaxes(title_text="Player's number")
players_number.update_yaxes(title_text="Number of games it was used in")
pyo.init_notebook_mode()
pyo.iplot(players_number, filename = 'Number of time the most used numbers have been used (>1,000)')
# players_number.show()
# Creating a substitute df to merge
df_events_players = df_events.merge(df_players[["player_name", "player_id", "match_id"]],
how="left")
# Creating df with all the scorers
df_scorers = df_events_players[df_events_players["type"]=="goal"].groupby(["player_name", "player_id"])\
.count()\
.sort_values(by="match_id", ascending=False)\
.reset_index()\
[["player_name", "match_id", "player_id"]]\
.rename(columns={"match_id":"goals"})
# Creating df with all the assisters
df_assisters = df_events_players[df_events_players.assist_id.notna()].drop_duplicates(subset=["clock_label", "assist_id"])\
.groupby(["player_name", "player_id"])\
.count()\
.reset_index()\
[["match_id", "player_id"]]\
.rename(columns={"match_id":"assists"})
# Adding the number of assists to the scorers
df_scorers = df_scorers.merge(df_assisters,
how="left",
on="player_id").fillna(0)
# Adding players infos to df_scorers
df_scorers = df_scorers.merge(df_players_info[["birth_country", "player_id", "normal_position"]],
how="left")
df_scorers.index = list(range(1, len(df_scorers)+1))
df_scorers["assists"] = df_scorers["assists"].astype(int)
# Creating a new df of the scorers with at least 100 goals
df_scorers_100_goals = df_scorers[df_scorers["goals"] > 100]
more_goals_100 = px.bar(capitalize(df_scorers_100_goals),
x='player_name', y='goals',
hover_data=["goals", "assists", "birth_country", "normal_position"],
color='goals',
height=500,
color_continuous_scale=px.colors.sequential.Blues[4:],
title = "Players who scored more than 100 goals",
template="plotly_white", range_y=[70, 260])
more_goals_100.update_xaxes(title_text="Player's number", tickangle=-90)
more_goals_100.update_yaxes(title_text="Number of games it was used in")
more_goals_100.update(layout_coloraxis_showscale=False)
# py.plot(more_goals_100)
more_goals_100.show()
Most effective player (aka the one that had the less number of minutes per goal)
# Grouping time to have total time per player
df_times = df_players.groupby(["player_id", "normal_position"])\
.sum()\
.sort_values(by="time_pitch", ascending=False)\
.reset_index()\
[["player_id", "time_pitch"]]
# Merging scorers and their time together
df_goals_vs_time = df_scorers.merge(df_times, how="left", on="player_id").dropna()
# Getting minutes per goal
df_goals_vs_time["goal_every"] = round(df_goals_vs_time["time_pitch"] / df_goals_vs_time["goals"], 2)
df_goals_vs_time.head(5)
| player_name | goals | player_id | assists | birth_country | normal_position | time_pitch | goal_every | |
|---|---|---|---|---|---|---|---|---|
| 0 | alan shearer | 260 | 89.0 | 136 | england | forward | 37881.0 | 145.70 |
| 1 | wayne rooney | 208 | 2064.0 | 118 | england | forward | 36237.0 | 174.22 |
| 2 | andrew cole | 187 | 725.0 | 112 | england | forward | 30139.0 | 161.17 |
| 3 | sergio agüero | 184 | 4328.0 | 110 | argentina | forward | 18881.0 | 102.61 |
| 4 | frank lampard | 177 | 800.0 | 77 | england | midfielder | 47744.0 | 269.74 |
# subsetting to keep players who played at least 1000 minutes.
df_goals_vs_time = df_goals_vs_time[df_goals_vs_time["time_pitch"]>1000]
# Getting the top 20 players
df = df_goals_vs_time.sort_values("goal_every").reset_index().loc[:20].round(2)
fig = px.bar(df.sort_values('goal_every', ascending=False),
x='goal_every', y='player_name',
hover_data=["goals", "time_pitch", "normal_position", "goal_every"],
color='goal_every',
range_x=[100, 160],
height=500,
color_continuous_scale=px.colors.sequential.Blues[3:][::-1],
template="plotly_white",
title="Most effective players (lower is better)", barmode='stack')
fig.update_xaxes(title_text='Goals every (in minutes)')
fig.update_yaxes(title_text='Player name')
fig.update(layout_coloraxis_showscale=False)
# py.plot(fig)
fig.show()
Most effective player
goal_time = px.scatter(capitalize(df_goals_vs_time), y="goals", x="time_pitch",
color="normal_position", hover_name="player_name",
title="Minutes played vs Goals scores vs Player position", range_y=[-20, 270])
goal_time.update_xaxes(title_text='Time on pitch (in minutes)')
goal_time.update_yaxes(title_text='Number of goals inn career')
goal_time.update_layout(legend_title="Player's position")
goal_time.add_shape(
dict(type="circle", x0=-100, y0=-50, x1=20000, y1=100), row=1, col="all", line_color="orange")
# py.plot(goal_time)
goal_time.show()
# df_goals_vs_time.isna().sum()
This graph is what we could expect from it. Most players concentrated in what seems to be a triangle (0, 0), (12.5k, 50) and (20k, 0). There is a clear color orders for the number of goals scored: purple for goalkeepers, green for defenders, red for midfielders and then blue for forwards. It's logical and makes sense that the closer your position is from the opponent's goal the more likely you are to score a goal. This change if a player played a lot. For example, David Beckham (red point at (21k, 62)) scored more goals than most forward with less than 20,000 minutes played as he played way more than them (and was very good too).
The three most extreme points are:
You may not realize but 40,000 minutes played is the equivalent of playing every minute of every Premier League game for more than 11 seasons. Mind blowing.
TALK ABOUT HENRY, AGUERO, etc...
fig = px.scatter_3d(df_goals_vs_time, x='goals', y='time_pitch', z='assists',
color='normal_position',
labels={"time_pitch":"Minutes played",
"assists":"Number of assists",
"goals":"Number of goals"},
hover_data=df_goals_vs_time.columns,
title="Minutes played vs goals scores vs assists vs player's position")
fig.update_layout(legend_title="Player's position")
fig.show()
Goals vs assists over the years
sub_players = df_players[["match_id", "player_id", "time_pitch"]]
sub_events = df_events[df_events.type=="goal"][["player_id", "assist_id", "match_id"]]
sub_df = sub_events.set_index("match_id")\
.unstack().reset_index()\
.rename(columns={"level_0":"type", 0:"player_id"})\
.dropna(subset=["player_id"])
sub_df["goals"] = sub_df["type"].apply(lambda x: 1 if x == "player_id" else 0)
sub_df["assists"] = sub_df["type"].apply(lambda x: 1 if x == "assist_id" else 0)
sub_df.drop("type", axis=1, inplace=True)
sub_df = sub_df.groupby(["match_id", "player_id"])\
.sum()\
.reset_index()
sub_df = sub_df.merge(df_players_info[["player_id", "player_name", "normal_position"]],
how="left",
on="player_id")
match_id_season = dict(zip(df_games.match_id, df_games.season))
sub_df["season"] = sub_df["match_id"].apply(lambda x: match_id_season[x])
sub_df = sub_df.merge(df_players[["match_id", "player_id", "time_pitch"]],
how = "left",
on=["match_id", "player_id"])
sub_df = sub_df.groupby(["player_name", "normal_position", "season"]).sum().reset_index().sort_values("season")
color=["#636efb", "#ef553b", "#00cc96", "#ab63fa"]
diff = ["forward", "midfielder", "defender", "goalkeeper"]
colors = {}
for index, x in enumerate(color):
colors[diff[index]] = f"rgb{str(px.colors.hex_to_rgb(x))}"
assist_goals = px.scatter(sub_df,
y="goals", x="assists", color="normal_position",
animation_frame="season", hover_name="player_name", size="time_pitch",
color_discrete_map=colors, range_y=[0, 40],
title = "Goals vs assists per season")
assist_goals.update_xaxes(title_text="Assists")
assist_goals.update_yaxes(title_text="Goals")
assist_goals.update_layout(legend_title="Player's position")
# py.plot(assist_goals)
assist_goals
Best goalkeeper
home = df_games[["home_team_id", "away_score", "match_id", "season", "home_team"]].values.tolist()
away = df_games[["away_team_id", "home_score", "match_id", "season", "away_team"]].values.tolist()
df_team_score = home + away
df_team_score = pd.DataFrame(df_team_score, columns = ["team_id", "score", "match_id", "season", "team"])
clean_sheet = df_team_score[df_team_score.score==0]
clean_sheet[(clean_sheet.season=="2004/05") & (clean_sheet.team_id==4)]
goalkeepers = df_players[(df_players.normal_position=="goalkeeper") &\
(df_players.substitute==0)]\
[["team_id", "match_id", "player_name"]]
clean_sheet = clean_sheet.merge(goalkeepers, how="left", on=["team_id", "match_id"])
clean_sheet = clean_sheet.groupby(["season", "player_name", "team"]).count().reset_index()\
.sort_values(["season", "team_id"], ascending=[1, 0])
clean_sheet = clean_sheet[clean_sheet.team_id>9]
fig = px.bar(capitalize(clean_sheet), y="team_id", x="player_name", color = "team_id",
color_continuous_scale=px.colors.sequential.BuPu[4:],
animation_frame="season", hover_name="player_name", range_y=[0, 25],
title="Cleansheets per season (at leat 10)", hover_data=["team"])
fig.update_xaxes(title_text=None, tickangle=40)
fig.update_yaxes(title_text="Cleansheet count")
fig.update(layout_coloraxis_showscale=False)
# py.plot(fig)
fig
Avg minutes for goal per position
sub_df = df_goals_vs_time.groupby("normal_position").mean()["goal_every"]\
.round(0)\
.sort_values()
for x in sub_df.index:
print(f"On average, a {x} scores a goal every {int(sub_df[x]):,} minutes.")
On average, a Forward scores a goal every 340 minutes. On average, a F scores a goal every 1,228 minutes. On average, a Midfielder scores a goal every 1,240 minutes. On average, a D scores a goal every 1,455 minutes. On average, a Defender scores a goal every 3,049 minutes. On average, a Goalkeeper scores a goal every 28,048 minutes.
Who's the best duo ever? Which combinaison of two players passed anbd s
players = {}
sus = df_players.drop_duplicates(subset=["player_name", "player_id"]).reset_index(drop=True)
for x in tqdm(range(len(sus))):
a = sus.loc[x , ["player_name", "player_id"]].to_list()
players[a[1]] = a[0]
df = df_events[["player_id", "assist_id"]].dropna()
df = df.replace(players)
df["combi"] = df["player_id"].astype(str) + " - " + df["assist_id"].astype(str)
df_combi = pd.DataFrame()
for x in tqdm(df["combi"]):
df_combi.loc[x, "count"] = df["combi"].to_list().count(x)
df_combi = df_combi.reset_index().sort_values("count", ascending=False).head(20)
100%|██████████| 4863/4863 [00:00<00:00, 6364.89it/s] 100%|██████████| 20337/20337 [00:08<00:00, 2291.37it/s]
df_combi["count"] = df_combi["count"].astype(int)
df_combi["index"] = df_combi["index"].str.title()
table(df_combi.rename(columns={"index":"Players: scorer - assist"}))
best_scorers_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
.sort_values("goals", ascending=False)\
.drop_duplicates(subset=["birth_country"], keep="first")\
[["player_name", "goals", "birth_country"]]
best_assisters_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
.sort_values("goals", ascending=False)\
.drop_duplicates(subset=["birth_country"], keep="first")\
[["player_name", "assists", "birth_country"]]
best_assisters_country
| player_name | assists | birth_country | |
|---|---|---|---|
| 0 | alan shearer | 136 | england |
| 3 | sergio agüero | 110 | argentina |
| 5 | thierry henry | 99 | france |
| 12 | robin van persie | 78 | netherlands |
| 13 | jimmy floyd hasselbaink | 73 | suriname |
| ... | ... | ... | ... |
| 2198 | ragnar klavan | 1 | estonia |
| 2232 | mbwana samatta | 0 | tanzania |
| 1973 | emmanuel rivière | 1 | martinique |
| 1961 | domingos quina | 1 | guinea-bissau |
| 2150 | paul tisdale | 1 | malta |
109 rows × 3 columns
df_scorers
| player_name | goals | player_id | assists | birth_country | normal_position | |
|---|---|---|---|---|---|---|
| 1 | alan shearer | 260 | 89.0 | 136 | england | forward |
| 2 | wayne rooney | 208 | 2064.0 | 118 | england | forward |
| 3 | andrew cole | 187 | 725.0 | 112 | england | forward |
| 4 | sergio agüero | 184 | 4328.0 | 110 | argentina | forward |
| 5 | frank lampard | 177 | 800.0 | 77 | england | midfielder |
| ... | ... | ... | ... | ... | ... | ... |
| 2452 | jelle van damme | 1 | 2795.0 | 1 | belgium | defender |
| 2453 | jefferson montero | 1 | 10518.0 | 1 | ecuador | midfielder |
| 2454 | roger espinoza | 1 | 4699.0 | 0 | honduras | midfielder |
| 2455 | jeff whitley | 1 | 1059.0 | 1 | northern ireland | midfielder |
| 2456 | özalan alpay | 1 | 1839.0 | 0 | turkey | defender |
2456 rows × 6 columns
# creating gradient color
gradient = "E63946 E63946 F26430 CDEAE5 A8DADC 90C3CD 77ABBD 92AFD7 1D3557"
gradient = ["#"+x.lower() for x in gradient.split(" ")]
# gradient = (gradient*10)[:15]
gradient
gradient
# Renaming countries to make them fit with choropleth's country list
df = df_players_info.copy()
not_in = ['faroe islands', 'cape verde', 'curacao', np.nan, 'montserrat', 'guadeloupe', 'bermuda', "reunion",
'barbados', 'martinique', 'malta', 'grenada', 'antigua & barbuda', "st. kitts & nevis"]
country_replace = {'united states': 'united states of america',
'congo - kinshasa': "congo",
'wales':"United Kingdom",
'scotland' : "United Kingdom",
'northern ireland' : "United Kingdom",
'cote d’ivoire' :"Côte d'Ivoire",
'central african republic': "Central African Rep.",
'england': "United Kingdom",
'bosnia & herzegovina': "Bosnia and Herz.",
"trinidad & tobago": "Trinidad and Tobago",
'north macedonia': "Macedonia",
'yugoslavia': "Serbia"}
df["birth_country"] = df["birth_country"].replace(country_replace).to_list()
best_scorers_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
.sort_values("goals", ascending=False)\
.drop_duplicates(subset=["birth_country"], keep="first")\
[["player_name", "goals", "birth_country"]]\
.rename(columns={"player_name":"Best scorer"})
best_assisters_country = df_scorers.merge(df_players_info[["player_id", "birth_country"]])\
.sort_values("goals", ascending=False)\
.drop_duplicates(subset=["birth_country"], keep="first")\
[["player_name", "assists", "birth_country"]]\
.rename(columns={"player_name":"Best assister"})
best_scorers_country["birth_country"] = best_scorers_country["birth_country"].replace(country_replace).to_list()
best_assisters_country["birth_country"] = best_assisters_country["birth_country"].replace(country_replace).to_list()
# display(best_scorers_country)
# print(uhij)
country_count = {i:df["birth_country"].to_list().count(i) for i in df["birth_country"].unique()}
country_count = pd.DataFrame(country_count, index=["count"]).T.reset_index()\
.rename(columns={"index":"birth_country"})
country_count = country_count.merge(best_scorers_country,
how="left").merge(best_assisters_country,
how="left")
country_count["birth_country"] = country_count["birth_country"].str.title()
country_count = country_count.rename(columns={"birth_country":"Country",
"goals":"Best scorer goals",
"assists":"Best assister assists",
"count": "Number of player from"})
country_count
world = px.choropleth(capitalize(country_count), locations="Country", locationmode="country names",
color="Number of player from", # lifeExp is a column of gapminder
hover_name="Country", # column to add to hover information
color_continuous_scale=gradient[::-1],
range_color=[10, 500],
title="Origin of the players", hover_data=["Best scorer", "Best scorer goals",
"Best assister", "Best assister assists"])
# py.plot(world)
world.show()
It's not a surprise that most players come from The UK. The second country being France, Spain.
I'm quite surprised that that many players come from the US as it's not a country that you think may generate male football players (female are more into football than male).
Do players play for the country they are born in? (check not null values)
# Players who played for the country they were born in
##########
sub_df = df_players[["nateam", "birth_country", "player_id"]].drop_duplicates().reset_index(drop=True)
lst = [1 for x in range(len(sub_df)) if sub_df.loc[x, "birth_country"]==sub_df.loc[x, "nateam"]]
same = lst.count(1)
diff = len(df_players) - same
print(f"Among {len(sub_df):,} players, {same:,} played for the country they were born in.")
Among 4,863 players, 4,474 played for the country they were born in.
Here are some stats that deverve to be told but don't really deserve a big part
Best substitute Most used player who started on the bench
df = df_players.merge(df_events[df_events["description"]=="sub_on"][["description", "player_id", "match_id"]],
how="right")
df = df.groupby(["player_name", "player_id"])\
.sum()["time_pitch"]\
.to_frame()\
.reset_index()\
.sort_values("time_pitch", ascending=False)
sub_players = df_players.groupby(["player_name", "player_id"])\
.sum()["time_pitch"]\
.to_frame()\
.reset_index()\
.sort_values("time_pitch", ascending=False)
sub_players.rename(columns={"time_pitch": "total_time_pitch"}, inplace=True)
df = df.merge(sub_players, on=["player_name", "player_id"], how="left")
df["ratio"] = (df["total_time_pitch"] / df["time_pitch"]).round(0)
df.head()
# sub_players
| player_name | player_id | time_pitch | total_time_pitch | ratio | |
|---|---|---|---|---|---|
| 0 | aaron connolly | 21653 | 0.0 | 1630.0 | inf |
| 1 | michael meaker | 459 | 0.0 | 1732.0 | inf |
| 2 | michael oakes | 34 | 0.0 | 6264.0 | inf |
| 3 | michael obafemi | 21532 | 0.0 | 590.0 | inf |
| 4 | michael owen | 1208 | 0.0 | 21638.0 | inf |
df = df_players.merge(df_events[df_events["description"]=="sub_oon"][["description", "player_id", "match_id"]],
how="right")
Enemy of own club: who scored the most own goals?
player_id = df_events[df_events["description"]=="own_goal"]["player_id"].value_counts().index[0]
count = df_events[df_events["description"]=="own_goal"]["player_id"].value_counts().values[0]
player_name = df_players[df_players["player_id"]==player_id]["player_name"].values[0]
print(f"{player_name.title()} scored {count} own-goals during his career in Prmier League.")
Richard Dunne scored 10 own-goals during his career in Prmier League.
df_scorers = df_events[df_events["type"]=="goal"][["player_id", "match_id"]]\
.merge(df_players[["player_name", "age", "player_id", "match_id"]])
df_scorers = df_scorers.sort_values(["player_name", "age"])
# df_scorers["player_name"].value_counts()[0]
df_goal_age = pd.DataFrame(columns = df_scorers["player_name"].unique(),
index = list(range(1, df_scorers["player_name"].value_counts()[0]+1)))
for index, player in enumerate(tqdm(df_scorers["player_name"])):
player_goals = df_scorers[df_scorers["player_name"]==player]
df_goal_age.loc[list(range(1, len(player_goals)+1)), player] = player_goals["age"].to_list()
# display(player_goals, df_goal_age)
# break
df_goal_age.fillna(100, inplace=True)
100%|██████████| 28915/28915 [00:30<00:00, 935.46it/s]
df_scorers_record = pd.DataFrame(columns=["player_name", "age", "count", "goal"])
goals = [1] + list(range(0, 250, 25))[1:]
for goal in goals:
for player, age in df_goal_age.loc[goal, ].sort_values().items():
df_scorers_record.loc[len(df_scorers_record)] = [player, age, count, goal]
break
df_scorers_record = df_scorers_record[df_scorers_record["age"]!=100]
df_scorers_record["age_num"] = df_scorers_record["age"].apply(lambda x:\
str(int(x)) + " years, " + str(int((x-int(x))*365)) + "days")
goal_age = px.bar(capitalize(df_scorers_record), x="goal", y="age", color="player_name", barmode='group',
hover_name="player_name", hover_data=["age_num"],
title="Youngest players to score X goals")
goal_age.update_xaxes(title_text='Goals count')
goal_age.update_yaxes(title_text='Player age')
goal_age.update_layout(legend_title="Player name")
df_games["season"].unique()
array(['2013/14', '2007/08', '1995/96', '2010/11', '2003/04', '1997/98',
'2005/06', '2001/02', '2018/19', '1993/94', '2008/09', '2019/20',
'2015/16', '1999/00', '2004/05', '2017/18', '2012/13', '2011/12',
'2016/17', '1994/95', '2020/21', '1992/93', '1996/97', '2009/10',
'2014/15', '2002/03', '1998/99', '2006/07', '2000/01'],
dtype=object)
qual_releg = {}
# Looping through every table on the wiki page of the season
for season in tqdm(df_games["season"].unique()):
try:
url = f"https://en.wikipedia.org/wiki/{season[:4]}%E2%80%93{season[-2:]}_Premier_League"
tables = pd.read_html(url)
except:
url = f"https://en.wikipedia.org/wiki/{season[:4]}%E2%80%93{season[-2:]}_FA_Premier_League"
tables = pd.read_html(url)
for wiki_table in tables:
# I want the ranking table so I check the one with this column (there is only on table)
if "Qualification or relegation" in wiki_table.columns:
wiki_table["Pos"] = wiki_table["Pos"].astype(str)
wiki_table = wiki_table[wiki_table["Pos"].str.isdigit()]
qualifs = wiki_table["Qualification or relegation"]
qualifs = qualifs.str.lower().fillna("").to_list()
qual_releg[season] = qualifs
100%|██████████| 29/29 [00:24<00:00, 1.16it/s]
Creating a ranking dictionary To have all the rankings from each season in one place, I'm going to store in a dictionary each season as a key and each ranking (pd.DataFrame) as a value.
# Defining the interesting columns
sub_df = df_games[["season", "home_team", "away_team", "home_score", "away_score"]]
champs = {}
# Looping through every season
for season in tqdm(sub_df["season"].unique()):
year_df = sub_df[sub_df["season"]==season].reset_index(drop=True)
teams = year_df["home_team"].unique()
ranking = pd.DataFrame(index = teams, columns = ["points", "W", "D", "L", "GF", "GA", "GD"])
ranking.fillna(0, inplace=True)
for index in range(len(year_df)):
htg = year_df.loc[index, "home_score"] # home team goals
atg = year_df.loc[index, "away_score"]
ht = year_df.loc[index, "home_team"] # home team
at = year_df.loc[index, "away_team"]
if htg > atg:
ranking.loc[ht, "points"] += 3
ranking.loc[ht, "W"] += 1
ranking.loc[at, "L"] += 1
elif htg < atg:
ranking.loc[at, "points"] += 3
ranking.loc[at, "W"] += 1
ranking.loc[ht, "L"] += 1
else:
ranking.loc[at, "points"] += 1
ranking.loc[ht, "points"] += 1
ranking.loc[[ht, at], "D"] += 1
# Goals for
ranking.loc[ht, "GF"] += htg
ranking.loc[at, "GF"] += atg
# Goals against
ranking.loc[ht, "GA"] += atg
ranking.loc[at, "GA"] += htg
# Goals difference
ranking.loc[ht, "GD"] += htg - atg
ranking.loc[at, "GD"] += atg - htg
ranking["played"] = ranking["W"] + ranking["D"] + ranking["L"]
ranking = ranking.sort_values(["points", "GD"], ascending=False)
ranking.insert(loc = 0,
column = 'team',
value = ranking.index)
# The index being the rank of the team
ranking.index = list(range(1, 23)) if season in ["1992/93", "1993/94", "1994/95"] else list(range(1, 21))
try:
ranking["qual/releg"] = qual_releg[season]
except:
print(qual_releg[season])
display(ranking)
champs[season] = ranking
100%|██████████| 29/29 [00:03<00:00, 7.41it/s]
Which season was the tightest? How many points/goals of difference between the top 5 or even top 2?
top_2 = {}
for season, ranking in champs.items():
# The 7 top teams may play UCL/UEL
points_top7 = ranking.loc[list(range(1, 8)), "points"].to_list()
point_diff = [points_top7[x] - points_top7[x+1] for x in range(len(points_top7)-1)]
top_2[season] = point_diff[0]
year, diff = sorted(top_2.items(), key=lambda x: x[1])[0]
print(f"The season {year} was the tightest with {diff} points of difference!")
# cap11_12 = capitalize(champs[year].copy())
ranking_11_12 = table(capitalize(champs[year].copy()).iloc[:, :-1].head(2), 60)
ranking_11_12
# py.plot(ranking_11_12)
The season 2011/12 was the tightest with 0 points of difference!
That season was crazy! Imagine, after 38 games, to teams have the exact same number of points. If you look at the ranking table, they actually had the exact same number of wins, draws and looses. Manchester City won because of a difference of 8 goals. This is crazy and almost never happens but where did MU lost points?
# Subsetting by the year
sub_df = df_games[df_games["season"] == year][["home_team", "home_score", "away_score", "away_team",
"outcome", "home_htscore", "away_htscore",
"home_team_abbr", "away_team_abbr", "stadium", "match_id"]]
# Subsetting by MMU playing home AND outcome being A (away) or D (Draw) or
# MMU playing away AND outcome being H (home) or D (Draw)
print("Manchest United could have won the title if they would have taken points on those games: ")
mu_looses = sub_df[(sub_df["home_team_abbr"].str.contains("mun") & sub_df["outcome"].str.contains("|".join(["a", "d"]))) |\
(sub_df["away_team_abbr"].str.contains("mun") & sub_df["outcome"].str.contains("|".join(["h", "d"])))]
mu_looses
Manchest United could have won the title if they would have taken points on those games:
| home_team | home_score | away_score | away_team | outcome | home_htscore | away_htscore | home_team_abbr | away_team_abbr | stadium | match_id | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 233 | manchester united | 2 | 3 | blackburn rovers | a | 0 | 1 | mun | blb | old trafford | 7650 |
| 2186 | stoke city | 1 | 1 | manchester united | d | 0 | 1 | stk | mun | bet365 stadium | 7521 |
| 2202 | liverpool | 1 | 1 | manchester united | d | 0 | 0 | liv | mun | anfield | 7537 |
| 5101 | manchester united | 1 | 6 | manchester city | a | 0 | 1 | mun | mci | old trafford | 7554 |
| 5567 | manchester united | 4 | 4 | everton | d | 1 | 1 | mun | eve | old trafford | 7810 |
| 5673 | wigan athletic | 1 | 0 | manchester united | h | 0 | 0 | wig | mun | dw stadium | 7794 |
| 6746 | manchester city | 1 | 0 | manchester united | h | 1 | 0 | mci | mun | etihad stadium | 7822 |
| 7264 | manchester united | 1 | 1 | newcastle united | d | 0 | 0 | mun | new | old trafford | 7589 |
| 8741 | newcastle united | 3 | 0 | manchester united | h | 1 | 0 | new | mun | st james' park | 7665 |
| 9070 | chelsea | 3 | 3 | manchester united | d | 1 | 0 | che | mun | stamford bridge | 7704 |
They did loose twice against Man City with a lost of 6-1 at Old Trafford (MU's stadium).
This is quite surprising and as said, never happens but wait, there is more.
last_game_id = df_games[(df_games["season"]=="2011/12") & (df_games["home_team_abbr"]=="mci")]\
.sort_values("kickoff_dt").tail(1)["match_id"].squeeze()
sub_df = df_events[["clock_label", "home_team_score", "away_team_score", "match_id", "type"]]
sub_df[(sub_df["match_id"]==last_game_id) & (sub_df["type"]=="goal")]\
.rename(columns={"score_homeScore":"Manchester City", "score_awayScore":"Queens Park Rangers"})\
.drop(["match_id", "type"], axis=1)
| clock_label | home_team_score | away_team_score | |
|---|---|---|---|
| 41311 | 39 | 1 | 0 |
| 41316 | 48 | 1 | 1 |
| 41320 | 66 | 1 | 2 |
| 41328 | 90 | 2 | 2 |
| 41329 | 90 | 3 | 2 |
Yes, what you see is right, on a season that lasted more than 9 months, Manchester United lost the title on the last game, four minutes before the end. Tragedy.
You can watch the last 10 minutes of the game and how the players and fans go crazy when Agüero scores the last goal in the added time.
One more thing. These two teams WALKED on the Premier League this year with a difference of 19 points from the third team. 19 points is the equivalent of 6 wins plus 1 draw where the other lost 7 games. Champions.
for season, ranking in champs.items():
if ranking.iloc[-3, 1] ==49:
print(season)
1992/93
#######
# Points of difference fro meach team
year = "2011/12"
points = list(champs[year]["points"])
values = [0] + [points[x] - points[x+1] for x in range(len(points)-1)][::-1]
teams = champs[year]["team"][::-1].to_list()
waterfall = go.Figure(go.Waterfall(
name = "20", orientation = "v",
x = list(map(lambda x: x.title(), teams)),
textposition = "outside",
text = list(map(lambda x: "+" + str(x) if x > 0 else str(x), values)),
y = values))
waterfall.update_layout(
title = f"Points of difference with the team ranked after itself. Season: {year} - Winner: {teams[-1]}")
waterfall.update_yaxes(title_text="Points difference")
waterfall.update_xaxes(title_text="Team", tickangle=-90)
waterfall.update_layout(height=650)
# py.plot(waterfall)
waterfall.show()
There were only 6 points of difference between the three next. Incredible.
Okay, Manchester United lost against their best rival but did they win more Premier Leagues in the last 26 years?
# Creating a new dataframe
df_podiums = pd.DataFrame(columns=["team", "place", "season"])
for season, ranking in champs.items():
top3 = ranking.iloc[:3, ]
season = "19"+season[-2:] if season[0]=="1" else "20"+season[-2:]
season = "2000" if season == "1900" else season
for index, team in enumerate(top3["team"]):
if any([team, index+1]==x for x in df_podiums[["team", "place"]].values.tolist()):
df_podiums.loc[(df_podiums.team==team) & (df_podiums.place==index+1), "season"] += ", "+season
else:
df_podiums.loc[len(df_podiums)] = [team, index+1, season]
df_podiums["count"] = df_podiums["season"].apply(lambda x: x.count(",")+1)
df_podiums = df_podiums.sort_values(["count", "place"], ascending=[0, 1])
df_podiums["place"] = df_podiums["place"].apply(lambda x: \
"Champion" if x==1 else("Runner-up" if x==2 else "2nd runner-up (3rd)"))
df_podiums = df_podiums.sort_values("count", ascending=False)
df_podiums.columns = list(map(lambda x: x.title(), df_podiums.columns))
podiums = px.bar(capitalize(df_podiums), x="Team", y="Count",
color='Place', barmode='group',
height=500, hover_name="Season",
title="Number of time each team has finished at which place of the podium")
podiums.update_xaxes(title_text='Team name')
podiums.update_yaxes(title_text='Count')
# py.plot(podiums)
podiums.show()
Of course they did! Since 1995, Manchester United won 11 times the Premier League and MCI 5 times! MU didn't win since 2013 and MCI didn't (not actually true, explain why).
Special mention to Leicester, one time on the podium, one time champion. You may not realize but this had almost no chance to happen, at the beginning of the season, the odds where 5001/1 which mean that 10 Dollars bet on Leicester would have returned you 50,010 Dollars. Unpredictable.
def minmax_season(element, min_max):
season = ""
variable = 0 if min_max == "max" else 1000
for year, ranking in champs.items():
maax = ranking[element].max()
miin = ranking[element].min()
if maax > variable and min_max == "max":
season = year
variable = maax
elif miin < variable and min_max == "min":
# print(variable)
season = year
variable = miin
sub_df = champs[season].copy()
print(f"Season: {season}")
if min_max == "max": display(sub_df[sub_df[element] == sub_df[element].max()])
elif min_max == "min": display(sub_df[sub_df[element] == sub_df[element].min()])
Even if I support neither MU neither MC, I still feel sad for MU so lets look at the greatest record and not show which team had the least points, least wins or max looses.
Max points
Again, City won the 2017/18 season with a record of three digits threshold. {find an adjective}
minmax_season("points", "max")
Season: 2017/18
| team | points | W | D | L | GF | GA | GD | played | qual/releg | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | manchester city | 100 | 32 | 4 | 2 | 106 | 27 | 79 | 38 | qualification for the champions league group s... |
Did they break the record for the most wins in a season though?
Max wins
minmax_season("W", "max")
Season: 2018/19
| team | points | W | D | L | GF | GA | GD | played | qual/releg | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | manchester city | 98 | 32 | 2 | 4 | 95 | 23 | 72 | 38 | qualification for the champions league group s... |
Of course they did...
Less looses
I know that City doesn't have this record. Even if they lost only two games in the year they broke the points record, Arsenal did something fantastic more than a decade before.
minmax_season("L", "min")
Season: 2003/04
| team | points | W | D | L | GF | GA | GD | played | qual/releg | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | arsenal | 90 | 26 | 12 | 0 | 73 | 26 | 47 | 38 | qualification for the champions league group s... |
If you know a little bit about football or follow it recently, you know that Arsenal has become a meme for its weak performances. This former great team, part of the BIG 6, had some difficulty recently but did one thing that will stay in history forever. Arsenal did not loose a single match during the season 2003/04 and for that they had the greatest price: a golden Premier League trophy. The photo below is showing Arsene Wenger, Arsenal's former manager lifting the golden tropy. He served the team for 22 years, 1235 games (707W - 280D - 248L). When you see that managers now rarely stay more than four or five years, it really makes you think about the past and realize tat football has drastically changed.

Least goals against
Which goalkeeper has been amazing?
minmax_season("GA", "min")
Season: 2004/05
| team | points | W | D | L | GF | GA | GD | played | qual/releg | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | chelsea | 95 | 29 | 8 | 1 | 72 | 15 | 57 | 38 | qualification for the champions league group s... |
Jesus! In 04/05 Chelsea took only 15 goals! Who played in defense and who was the goalkeeper?
# Defining Chelsea's id
chelsea_id = df_games[df_games["home_team"]=="chelsea"]["home_team_id"].mean()
print(chelsea_id)
# Games id in 2004/05
gamesid_0405 = df_games[df_games["season"]=="2004/05"]["match_id"]
# Chealsea 2004/05 players
sub_df = df_players[(df_players["match_id"].isin(gamesid_0405)) & \
(df_players["team_id"]==chelsea_id)]
# display(sub_df)
sub_df = sub_df.groupby(["player_name", "normal_position", "player_id"]).sum()["time_pitch"].to_frame().reset_index()
sub_df[sub_df["normal_position"]=="goalkeeper"]
4.0
| player_name | normal_position | player_id | time_pitch | |
|---|---|---|---|---|
| 4 | carlo cudicini | goalkeeper | 1676 | 262.0 |
| 17 | lenny pidgeley | goalkeeper | 2231 | 0.0 |
| 22 | petr cech | goalkeeper | 2651 | 3150.0 |
Three goalkeepers played for Chelsea during this season: Petr Chech 92% of the time and Carlo Cudicini the rest of the time with Lenny Pidgeley who only played 23 minutes.
# Retrieving all chelsea players
chelsea_playersid = df_players[df_players["team_id"]==chelsea_id]["player_id"].unique()
# All games Petr Chech played
petr_gameid = df_players[(df_players["player_id"]==2651) &\
(df_players["match_id"].isin(gamesid_0405))]\
["match_id"]
# # All games total goals for opponant
apponents_goals = df_events[(df_events["type"]=="goal") &\
(df_events["match_id"].isin(petr_gameid)) &\
(~df_events["player_id"].isin(chelsea_playersid))]
# display(apponents_goals)
print(f"Games played: {len(petr_gameid)}")
print(f"Clean sheets: {len(petr_gameid) - apponents_goals.shape[0]}")
Games played: 35 Clean sheets: 23
Over 35 games (90mn * 35 = 3,150 time_pitch from the precedent output), Petr Cech did 23 clean sheets. I'm honestly a little bit disappointed as I thought he was going to have a better ratio (~80%) but we can see that he never took more than 2 goals so it makes sense! Legend.
sub_df = pd.DataFrame(columns = ["season", "team", "rank"])
for season, ranking in champs.items():
for rank in ranking.index:
sub_df.loc[len(sub_df)] = [season, ranking.loc[rank, "team"], int(rank)]
teams = ["manchester united", "liverpool", "arsenal",
"tottenham hotspur", "chelsea", "manchester city"]
sub_df = sub_df[sub_df["team"].isin(teams)]
sub_df["rank"] = sub_df["rank"].astype(int)
# sub_df["rank"] = sub_df["rank"].apply(lambda x: 10 + (10-x))
sub_df = sub_df.sort_values("season")
sub_df.season.unique()
array(['1992/93', '1993/94', '1994/95', '1995/96', '1996/97', '1997/98',
'1998/99', '1999/00', '2000/01', '2001/02', '2002/03', '2003/04',
'2004/05', '2005/06', '2006/07', '2007/08', '2008/09', '2009/10',
'2010/11', '2011/12', '2012/13', '2013/14', '2014/15', '2015/16',
'2016/17', '2017/18', '2018/19', '2019/20', '2020/21'],
dtype=object)
big6 = ["manchester city", "manchester united", "liverpool", "tottenham hotspur", "chelsea", "arsenal"]
df_points = pd.DataFrame(columns = df_games["home_team"].unique(),
index = df_games["season"].unique())
for season, ranking in champs.items():
for team in ranking["team"]:
# points = ranking.loc[ranking.team==team, "points"].squeeze()
df_points.loc[season, team] = ranking["team"].to_list().index(team) + 1
# df_points = df_points.fillna(0)
df_points = df_points[big6]
df_points
# # plotly
fig = go.Figure()
colors = px.colors.qualitative.Plotly
df = df_points
# set up multiple traces
for col in df.columns:
fig.add_trace(go.Scatter(x=df.index,
y=df[col],
name = col,
visible=True,
)
)
um = [ {} for _ in range(len(df.columns)) ]
buttons = []
menuadjustment = 0.10
buttonX = -0.05
buttonY = 1 + menuadjustment
for i, col in enumerate(df.columns):
button = dict(method='restyle',
label=col,
visible=True,
args=[{'visible':True,
'line.color' : colors[i]}, [i]],
args2 = [{'visible': False,
'line.color' : colors[i]}, [i]],
)
# adjust some button features
buttonY = buttonY-menuadjustment
um[i]['buttons'] = [button]
um[i]['showactive'] = False
um[i]['y'] = buttonY
um[i]['x'] = buttonX
# add a button to toggle all traces on and off
button2 = dict(method='restyle',
label='All',
visible=True,
args=[{'visible':True}],
args2 = [{'visible': False}],
)
# assign button2 to an updatemenu and make some adjustments
um.append(dict())
um[i+1]['buttons'] = [button2]
um[i+1]['showactive'] = True
um[i+1]['y']=buttonY - menuadjustment
um[i+1]['x'] = buttonX
# add dropdown menus to the figure
# print(um)
fig.update_layout(title="Ranking of the Big 6 per year", showlegend=True, updatemenus=um)
# adjust button type
for m in fig.layout.updatemenus:
m['type'] = 'buttons'
fig['layout']['yaxis']['autorange'] = "reversed"
lst = list(range(1, 21))
fig.update_layout(
yaxis = dict(
tickmode = 'array',
tickvals = lst[::2],
ticktext = list(map(lambda x: str(x), lst))[::2]
)
)
fig.update_xaxes(title_text="Season", tickangle=-90)
# py.plot(fig)
fig.show()
df_games["season"].value_counts()
1992/93 462 1993/94 462 1994/95 462 2013/14 380 2017/18 380 2006/07 380 1998/99 380 2002/03 380 2014/15 380 2009/10 380 1996/97 380 2020/21 380 2016/17 380 2011/12 380 2012/13 380 2004/05 380 2007/08 380 1999/00 380 2015/16 380 2019/20 380 2008/09 380 2018/19 380 2001/02 380 2005/06 380 1997/98 380 2003/04 380 2010/11 380 1995/96 380 2000/01 380 Name: season, dtype: int64
Deep explanation of what happened
here some quick facts, information where you may or may not find interest in but...
Avg & max nb or points for relegation
points_releg = []
releg_teams = []
for season, ranking in champs.items():
releg = "relegation to the football league first division"
releg_ranking = ranking[ranking["qual/releg"].str.contains("releg")]
points_releg += releg_ranking["points"].to_list()
releg_teams.extend(releg_ranking["team"].to_list())
print(f"""The average points to be relagated is {np.mean(points_releg):.0f} points.
The maximum that ever happened was {max(points_releg)}.""")
The average points to be relagated is 32 points. The maximum that ever happened was 49.
releg_count = pd.DataFrame({i:[releg_teams.count(i)] for i in set(releg_teams)}).T.reset_index()
releg_count.columns = ["team", "count"]
releg_count
fig = px.bar(capitalize(releg_count).sort_values("count", ascending=False), x='team', y='count',
title="Relagated teams count")
fig.update_xaxes(title_text="Team name", tickangle=-90)
fig.update_yaxes(title_text="Count")
fig.show()
Comeback after relegation
rankings = list(champs.values())
come_back = 0
all_releg_teams = []
for index, ranking in enumerate(rankings):
releg_teams = ranking[ranking["qual/releg"].str.contains("releg")]["team"].to_list()
all_releg_teams += releg_teams
try:
if any(team in rankings[index+2]["team"].to_list() for team in releg_teams):
come_back += [team in rankings[index+2]["team"].to_list() for team in releg_teams].count(True)
except IndexError:...
print(f"Over the {len(set(all_releg_teams))}/{len(all_releg_teams)} teams that have been relagated, {come_back} came back the next year.")
Over the 41/87 teams that have been relagated, 27 came back the next year.
Which team scored many goals against a special team?
df = df_games.groupby(["home_team", "away_team"]).sum()[["home_score", "away_score"]]
df = df.reset_index()
big6 = ["manchester city", "manchester united", "liverpool", "tottenham hotspur", "chelsea", "arsenal"]
df_team_vs_team_goals = pd.DataFrame(columns=big6, index=big6)
df_team_vs_team_count = pd.DataFrame(columns=big6, index=big6)
df_team_vs_team_goals.fillna(0, inplace=True)
df_team_vs_team_count.fillna(0, inplace=True)
for home_team in big6:
for away_team in big6:
if home_team != away_team:
sub = df_games[(df_games.home_team==home_team) & (df_games.away_team==away_team)]
df_team_vs_team_goals.loc[home_team, away_team] += sub["home_score"].sum()
df_team_vs_team_goals.loc[away_team, home_team] += sub["away_score"].sum()
df_team_vs_team_count.loc[home_team, away_team] += sub.shape[0]
df_team_vs_team_count.loc[away_team, home_team] += sub.shape[0]
list(df_team_vs_team_goals.columns)
['manchester city', 'manchester united', 'liverpool', 'tottenham hotspur', 'chelsea', 'arsenal']
fig = px.imshow(df_team_vs_team_goals,
zmin = sorted(set([x for lst in df_team_vs_team_goals.values for x in lst]))[1],
color_continuous_scale=px.colors.sequential.Blues)
fig.show()
# df_team_vs_team_count, df_team_vs_team_goals
vs_per_game = []
for index1, x in enumerate(df_team_vs_team_goals.values):
mid = []
for index2, y in enumerate(x):
# print(y/df_team_vs_team_count.values[index1][index2])
mid.append(round(y/df_team_vs_team_count.values[index1][index2], 2))
vs_per_game.append(mid)
df_team_vs_team_goals_game = pd.DataFrame(vs_per_game, columns = df_team_vs_team_goals.columns,
index = df_team_vs_team_goals.index)
df_team_vs_team_goals_game
| manchester city | manchester united | liverpool | tottenham hotspur | chelsea | arsenal | |
|---|---|---|---|---|---|---|
| manchester city | NaN | 1.27 | 1.33 | 1.42 | 1.04 | 1.19 |
| manchester united | 1.46 | NaN | 1.36 | 1.69 | 1.24 | 1.34 |
| liverpool | 1.54 | 1.16 | NaN | 1.72 | 1.26 | 1.67 |
| tottenham hotspur | 1.40 | 0.98 | 1.14 | NaN | 0.95 | 1.24 |
| chelsea | 1.48 | 1.22 | 1.21 | 1.78 | NaN | 1.34 |
| arsenal | 1.46 | 1.05 | 1.24 | 1.59 | 1.33 | NaN |
programmers = ['Alex','Nicole','Sara','Etienne','Chelsea','Jody','Marianne']
# base = datetime.datetime.today()
# dates = base - np.arange(180) * datetime.timedelta(days=1)
# z = np.random.poisson(size=(len(programmers), len(dates)))
# fig = go.Figure(data=go.Heatmap(
# z=df_team_vs_team_goals_game.values,
# x=df_team_vs_team_goals_game.index,
# y=df_team_vs_team_goals_game.columns,
# colorscale='Viridis'))
# fig.update_layout(
# title='GitHub commits per day',
# xaxis_nticks=36)
# fig.show()
# dates
I'm
total_ranking = pd.DataFrame()
for season, ranking in champs.items():
total_ranking = pd.concat([total_ranking, ranking], axis=0)
total_ranking = total_ranking.groupby("team").sum().reset_index(drop=False)
total_ranking["played"] = total_ranking["W"] + total_ranking["D"] + total_ranking["L"]
total_ranking["seasons"] = total_ranking["team"].apply(lambda x:\
[team for ranking in champs.values() for team in ranking["team"].to_list() ]\
.count(x))
for outcome in ["W", "D", "L"]:
total_ranking[f"{outcome}%"] = round(total_ranking[f"{outcome}"] / total_ranking["played"] * 100, 2)
print(total_ranking.shape)
total_ranking = total_ranking.sort_values(["W", "D"], ascending=False)
total_ranking.head(7)
(49, 13)
| team | points | W | D | L | GF | GA | GD | played | seasons | W% | D% | L% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 26 | manchester united | 2308 | 687 | 247 | 180 | 2128 | 1009 | 1119 | 1114 | 29 | 61.67 | 22.17 | 16.16 |
| 1 | arsenal | 2072 | 597 | 281 | 236 | 1956 | 1100 | 856 | 1114 | 29 | 53.59 | 25.22 | 21.18 |
| 13 | chelsea | 2064 | 597 | 273 | 244 | 1897 | 1092 | 805 | 1114 | 29 | 53.59 | 24.51 | 21.90 |
| 24 | liverpool | 2017 | 581 | 274 | 259 | 1927 | 1121 | 806 | 1114 | 29 | 52.15 | 24.60 | 23.25 |
| 42 | tottenham hotspur | 1716 | 480 | 276 | 358 | 1676 | 1398 | 278 | 1114 | 29 | 43.09 | 24.78 | 32.14 |
| 25 | manchester city | 1536 | 444 | 204 | 276 | 1559 | 1042 | 517 | 924 | 24 | 48.05 | 22.08 | 29.87 |
| 17 | everton | 1535 | 407 | 314 | 393 | 1448 | 1415 | 33 | 1114 | 29 | 36.54 | 28.19 | 35.28 |
table(total_ranking.round(0).head(76), line_break=True)
total_ranking = total_ranking.sort_values(["W%", "D%"], ascending=False)
# I actually did 5 more minutes of research and found that. Look and the start of the commented code haha...
stacked_ratio = pd.concat([total_ranking.set_index("team")[["W%", "D%", "L%"]].stack().reset_index(),
total_ranking[["W", "D", "L"]].stack().reset_index(drop=True)], axis=1)
# stacked_ratio.columns = ["team", "outcome", "percent", "count"]
stacked_ratio = stacked_ratio.merge(total_ranking, how="left", on="team")
stacked_ratio.columns = ['Team', 'Outcome', 'Percent', 'Count', 'Overall points', 'Wins',
'Draws', 'Looses', 'Goals for', 'Goals against', 'Goals difference',
'Games played', 'Seasons played', 'Win rate', 'Draw rate', 'Loose rate']
stacked_ratio = stacked_ratio[['Team', 'Outcome', 'Percent', 'Count', 'Games played', 'Seasons played',
'Overall points', 'Wins',
'Draws', 'Looses', 'Goals for', 'Goals against', 'Goals difference']]
# Formating hover_data in plot
stacked_ratio.insert(3, " ", "")
stacked_ratio.insert(8, " ", "")
stacked_ratio.insert(12, " ", "")
for col in stacked_ratio.columns:
if col != 'Percent':
stacked_ratio[col] = stacked_ratio[col].apply(lambda x: " "+str(x))
# Changing colors of variables
color=["#00afb9", "#fdfcdc", "#F24333"]
diff = [" W%", " D%", " L%"]
colors = {}
for index, x in enumerate(color):
colors[diff[index]] = f"rgb{str(px.colors.hex_to_rgb(x))}"
fig = px.bar(capitalize(stacked_ratio), y="Percent", x="Team", color="Outcome", hover_name="Team",
color_discrete_map=colors, title="Ratio Win, Draw, Lost in % - All seasons included",
template="plotly_white",
hover_data=stacked_ratio.columns.drop("Count"))
fig.update_xaxes(title_text="Team", tickangle=-90)
fig.update_yaxes(title_text="Percentage")
fig.update_layout(legend_title="Outcome", height=700)
# py.plot(fig)
fig.show()
Manchester Unites is on another planet. Being one of the 5 team that is in PL since 1995, it's also the team with the highest win ratio, 62%. Machine.
I told you before, you can make fun of Arsenal today, but over the last 26 years, this team did an astonishing 559 wins.
I cannot believe that! Arsenal and Chelsea, two ennemies have THE EXACT SAME NUMBER of wins. Over the 1114 games played over 29 years, in Premier League, they both won 597 times and both rank at the second place but Arsenal takes the advantages with slightly more draws. Let's dive into it to see the difference over the years.
df_chelsea_arsenal = pd.DataFrame(columns=["season", "team", "points"])
cpoints = 0
apoints = 0
for season, ranking in champs.items():
cpoints += ranking[ranking["team"]=="chelsea"]["W"].squeeze()
apoints += ranking[ranking["team"]=="arsenal"]["W"].squeeze()
df_chelsea_arsenal.loc[len(df_chelsea_arsenal)] = [season, "chelsea", cpoints]
df_chelsea_arsenal.loc[len(df_chelsea_arsenal)] = [season, "arsenal", apoints]
fig = px.line(df_chelsea_arsenal, x="season", y="points",
color="team", width=1000, height=400,
title="Cumulated points")
fig.update_layout(legend_title="Team")
fig.update_xaxes(title="Season")
fig.update_yaxes(title="Cumulated points")
# py.plot(fig)
fig.show()
I think in term of crazy insight, I have quite some in this analysis. In 1992/93, Chelsea and Arsenal respectively finished the first season of Premier League at 14 and 15 points. Next season will be the 30th edition of the PRemier league so maybe Chelsea, will finally pass its historical ennemy.
The best six teams being: blablabla
Is is that good? how are they performing together vs the rest of the league.? When did the big 6 really became the big 6?
ranking
| team | points | W | D | L | GF | GA | GD | played | qual/releg | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | manchester united | 80 | 24 | 8 | 6 | 79 | 31 | 48 | 38 | qualification for the champions league first g... |
| 2 | arsenal | 70 | 20 | 10 | 8 | 63 | 38 | 25 | 38 | qualification for the champions league first g... |
| 3 | liverpool | 69 | 20 | 9 | 9 | 71 | 39 | 32 | 38 | qualification for the champions league third q... |
| 4 | leeds united | 68 | 20 | 8 | 10 | 64 | 43 | 21 | 38 | qualification for the uefa cup first round[a] |
| 5 | ipswich town | 66 | 20 | 6 | 12 | 57 | 42 | 15 | 38 | qualification for the uefa cup first round[a] |
| 6 | chelsea | 61 | 17 | 10 | 11 | 68 | 45 | 23 | 38 | qualification for the uefa cup first round[a] |
| 7 | sunderland | 57 | 15 | 12 | 11 | 46 | 41 | 5 | 38 | |
| 8 | aston villa | 54 | 13 | 15 | 10 | 46 | 43 | 3 | 38 | qualification for the intertoto cup third round |
| 9 | charlton athletic | 52 | 14 | 10 | 14 | 50 | 57 | -7 | 38 | |
| 10 | southampton | 52 | 14 | 10 | 14 | 40 | 48 | -8 | 38 | |
| 11 | newcastle united | 51 | 14 | 9 | 15 | 44 | 50 | -6 | 38 | qualification for the intertoto cup third round |
| 12 | tottenham hotspur | 49 | 13 | 10 | 15 | 47 | 54 | -7 | 38 | |
| 13 | leicester city | 48 | 14 | 6 | 18 | 39 | 51 | -12 | 38 | |
| 14 | middlesbrough | 42 | 9 | 15 | 14 | 44 | 44 | 0 | 38 | |
| 15 | west ham united | 42 | 10 | 12 | 16 | 45 | 50 | -5 | 38 | |
| 16 | everton | 42 | 11 | 9 | 18 | 45 | 59 | -14 | 38 | |
| 17 | derby county | 42 | 10 | 12 | 16 | 37 | 59 | -22 | 38 | |
| 18 | manchester city | 34 | 8 | 10 | 20 | 41 | 65 | -24 | 38 | relegation to the football league first division |
| 19 | coventry city | 34 | 8 | 10 | 20 | 36 | 63 | -27 | 38 | relegation to the football league first division |
| 20 | bradford city | 26 | 5 | 11 | 22 | 30 | 70 | -40 | 38 | relegation to the football league first division |
# df_big6_points
df_big6_points = pd.DataFrame(columns=["season", "type", "points", "team"])
for season, ranking in champs.items():
b6_wins = ranking[ranking["team"].isin(big6)]["W"].sum() / ranking["W"].sum() * 100
b6_points = ranking[ranking["team"].isin(big6)]["points"].sum() / ranking["points"].sum() * 100
df_big6_points.loc[len(df_big6_points)] = [season, "wins", b6_points, "B6"]
df_big6_points.loc[len(df_big6_points)] = [season, "wins", 100-b6_points, "Others"]
# df_big6_points.loc[len(df_big6_points)] = [season, "points", b6_points, "B6"]
# df_big6_points.loc[len(df_big6_points)] = [season, "points", 100-b6_points, "Others"]
fig = px.bar(df_big6_points, y="points", x="season", color="team", barmode='group',
title="Distribution of points big 6 vs other teams (in %)")
fig.update_xaxes(tickangle=-90)
fig.update_xaxes(title_text="Season", tickangle=-90)
fig.update_yaxes(title_text="Percentage")
fig.update(layout_coloraxis_showscale=False)
# py.plot(fig)
fig.show()
At the beggining of the Premier League, the Big 6 was representing about 30% of the points and wins, which for a competition of 22 teams is fine. Yes, Manchester city, was not in PL from 95 to 00 but still, after they c
Before the Big 6 reigned on the England football, a 'top four' of dominant teams emerged. You saw the previous graph, Liverpool, Manchester City, Arsenal and Chelsea were perennial contenders in the division.
df_events[df_events["description"]=="yellow_card"]["clock_label"].value_counts()
89 10354
90 2183
45 873
44 486
88 459
...
2 50
5 50
3 48
1 33
0 5
Name: clock_label, Length: 91, dtype: int64
df_events["mn"] = df_events["clock_label"] + df_events["add_time"]
mn_perhalf = []
for x in tqdm(range(len(df_events))):
data = df_events.loc[x, ].to_list()
if data[1] == 1:
mn_perhalf.append(data[0]+data[-1])
else:
mn_perhalf.append(data[0]+data[-1]-45)
df_events["mn_perhalf"] = mn_perhalf
100%|██████████| 214429/214429 [00:12<00:00, 17054.77it/s]
events = {"yellow_card":"Yellow card", "red_card":"Red card", "2y_red_card":"2nd yellow --> red card",
"goal":"Goal", "penalty":"Penalty", "sub_on":"Substitution"}
df_events.replace(events)
| clock_label | phase | type | home_team_score | away_team_score | player_id | team_id | assist_id | description | match_id | add_time | mn | mn_perhalf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | play_start | 0 | 0 | NaN | NaN | NaN | NaN | 1875 | 0 | 0 | 0 |
| 1 | 5 | 1 | Goal | 0 | 1 | 1037.0 | 7.0 | 179.0 | Goal | 1875 | 0 | 5 | 10 |
| 2 | 45 | 1 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 1875 | 0 | 45 | 90 |
| 3 | 45 | 2 | play_start | 0 | 1 | NaN | NaN | NaN | NaN | 1875 | 0 | 45 | 45 |
| 4 | 64 | 2 | substitution | 0 | 1 | 1087.0 | 15.0 | NaN | Substitution | 1875 | 0 | 64 | 83 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 214424 | 90 | 2 | play_end | 2 | 2 | NaN | NaN | NaN | NaN | 8190 | 0 | 90 | 135 |
| 214425 | 0 | 1 | play_start | 0 | 0 | NaN | NaN | NaN | NaN | 8191 | 0 | 0 | 0 |
| 214426 | 45 | 1 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 8191 | 0 | 45 | 90 |
| 214427 | 45 | 2 | play_start | 0 | 1 | NaN | NaN | NaN | NaN | 8191 | 0 | 45 | 45 |
| 214428 | 90 | 2 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 8191 | 0 | 90 | 135 |
214429 rows × 13 columns
events = {"yellow_card":"Yellow card", "red_card":"Red card", "2y_red_card":"2nd yellow --> red card",
"goal":"Goal", "penalty":"Penalty"}
sub_df = df_events.replace(events)
events = list(events.values())[::-1]
data = []
for event in tqdm(events):
to_add = []
for clock_time in sub_df[sub_df["description"]==event]["mn"]:
to_add.append(clock_time)
data.append(to_add)
colors = "FFE548 FF2E1F F55A00 000000 4F6D7A"
colors = ["#"+x for x in colors.split(" ")][::-1]
distri = go.Figure()
count = 0
for data_line, color in zip(data, colors):
distri.add_trace(go.Violin(x=data_line[::-1],
line_color=color,
bandwidth=0.5,
name = events[count]))
count += 1
distri.update_traces(side='positive', width=2, points=False)
distri.update_layout(xaxis_showgrid=False, xaxis_zeroline=True)
distri.update_xaxes(title_text="Minutes in the game")
distri.update_yaxes(title_text="Event")
distri.update(layout_coloraxis_showscale=False)
distri.update_layout(title="Distribution of events over the minutes of the games",
legend_title="Event",
legend={'traceorder':'reversed'},
violingap=0.9)
# py.plot(distri)
distri.show()
100%|██████████| 5/5 [00:00<00:00, 127.54it/s]
sub_df = pd.DataFrame()
sub_df["team_id"] = df_games["home_team_id"].append(df_games["away_team_id"])
sub_df["match_id"] = df_games["match_id"].append(df_games["match_id"])
sub_df["location"] = ["home" for x in range(int(len(sub_df)/2))] + ["away" for x in range(int(len(sub_df)/2))]
sub_events = df_events[df_events.type.isin(["goal", "own_goal"])]
sub_df = sub_df.merge(sub_events, on=["team_id", "match_id"], how="right")[["location", "phase", "type"]]
sub_df = sub_df.groupby(["location", "phase"]).count()
total_goals = sum(sub_df.iloc[:2, 0].to_list() + sub_df.iloc[2:, 0].to_list())
sub_df["type"] = sub_df["type"].apply(lambda x: round(x/total_goals*100, 1))
sub_df
data = [sub_df.iloc[:2, 0].to_list()] + [sub_df.iloc[2:, 0].to_list()]
data
fig = px.imshow(data,
labels=dict(x="home/away team", y="Half", color="Goals"),
x=['Away', 'Home'],
y=["First half", "Second half"],
color_continuous_scale = px.colors.sequential.Blues)
fig.update_layout(title=f"Distribution of the goals scored in % - Total goals: {total_goals:,}")
# py.plot(fig)
fig.show()
In football, a come back is when a team is loosing at half time but manage to win the game at the end.
# df_events[~df_events["match_id"].isin(df_games["match_id"].to_list())]
missing_ids = set(df_games["match_id"].unique()) - set(df_events[df_events["type"]=="play_end"]["match_id"].unique())
missing_ids = list(missing_ids)
to_add = []
for missing_id in tqdm(missing_ids):
sub = df_events[df_events["match_id"]==missing_id]
# If there is an error, then there was no event meaning 0-0
try:
ht_score = sub[sub["clock_label"]<46].iloc[-1, 3:5]
except IndexError: ht_score = [0, 0]
ft_score = sub.iloc[-1, 3:5]
play_start1 = [0, 1, "play_start", 0, 0] + [np.nan]*4 + [missing_id]
play_end1 = [45, 1, "play_end", ht_score[0], ht_score[1]] + [np.nan]*4 + [missing_id]
play_start2 = [45, 2, "play_start", ht_score[0], ht_score[1]] + [np.nan]*4 + [missing_id]
play_end2 = [90, 2, "play_end", ft_score[0], ft_score[1]] + [np.nan]*4 + [missing_id]
to_add.extend([play_start1, play_end1, play_start2, play_end2])
to_add
0it [00:00, ?it/s]
test = df_events.copy()
sus = pd.DataFrame(to_add, columns = test.columns)
new = pd.concat([test, sus], axis = 0)
new
new[new["type"].isin(["play_end"])]
| clock_label | phase | type | home_team_score | away_team_score | player_id | team_id | assist_id | description | match_id | add_time | mn | mn_perhalf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 45 | 1 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 1875 | 0 | 45 | 90 |
| 10 | 90 | 2 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 1875 | 0 | 90 | 135 |
| 15 | 45 | 1 | play_end | 0 | 0 | NaN | NaN | NaN | NaN | 14249 | 2 | 47 | 92 |
| 35 | 90 | 2 | play_end | 3 | 0 | NaN | NaN | NaN | NaN | 14249 | 5 | 95 | 140 |
| 40 | 45 | 1 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 9597 | 3 | 48 | 93 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 214420 | 90 | 2 | play_end | 1 | 0 | NaN | NaN | NaN | NaN | 8189 | 0 | 90 | 135 |
| 214422 | 45 | 1 | play_end | 1 | 1 | NaN | NaN | NaN | NaN | 8190 | 0 | 45 | 90 |
| 214424 | 90 | 2 | play_end | 2 | 2 | NaN | NaN | NaN | NaN | 8190 | 0 | 90 | 135 |
| 214426 | 45 | 1 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 8191 | 0 | 45 | 90 |
| 214428 | 90 | 2 | play_end | 0 | 1 | NaN | NaN | NaN | NaN | 8191 | 0 | 90 | 135 |
22532 rows × 13 columns
df_come_back = new[new["type"]=="play_end"][["phase", "home_team_score", "away_team_score", "match_id"]]
df_come_back["outcome"] = df_come_back[["home_team_score", "away_team_score"]]\
.apply(lambda x: ("h" if x["home_team_score"] > x["away_team_score"] else "a") \
if x["home_team_score"] != x["away_team_score"] else "d",
axis=1)
cb_count = 0
hcb_count = 0
# for unique_id in df_come_back["match_id"].unique():
# outcome = df_come_back[df_come_back["match_id"]==]["outcome"].to_list()
games_ids = []
for index, outcome in enumerate(df_come_back["outcome"]):
if index % 2 == 0:
outcome = df_come_back.iloc[index:index+2, -1].to_list()
if "h" in outcome and "a" in outcome:
cb_count += 1
if outcome[1] == "h":
hcb_count += 1
games_ids.append(df_come_back.iloc[index, -2])
games = df_games.shape[0]
print(f"Over {games} games, there had been {cb_count} come-backs! {hcb_count/cb_count*100:.0f}% being home!")
Over 11266 games, there had been 453 come-backs! 58% being home!
count_yellow = df_events[df_events["description"]=="yellow_card"]["description"].count()
count_2yellow = df_events[df_events["description"]=="2y_red_card"]["description"].count()
print(f"Only {count_2yellow/count_yellow*100:.2f}% of yellow cards ({count_yellow:,}) lead to a second one ({count_2yellow:,}).")
# print(f"For {count_yellow:,} yellow cards given, ")
# only x of first card lead to a second
Only 1.03% of yellow cards (34,321) lead to a second one (355).